library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr)
#data <- read.csv(file="C:/Users/craig/source/repos/rcode/duck_river_rate_study/2018.01.csv", header=TRUE, sep=",")
data <- read_csv("C:/Users/craig/source/repos/rcode/duck_river_rate_study/2018.01.csv")
## Parsed with column specification:
## cols(
## h.ReadDate = col_datetime(format = ""),
## i.ReadDate = col_datetime(format = ""),
## Readvalue = col_double(),
## Uom = col_character(),
## ReadQualityCode = col_integer(),
## RecordInterval = col_integer(),
## VeeFlag = col_character(),
## ChannelStatus = col_character(),
## SubstationName = col_character(),
## MeterIdentifier = col_integer(),
## LocationNumber = col_integer()
## )
# fxnInterval_from_datetime numbers the date with a interger as ending intervals
fxnInterval_from_datetime <- function(dt) {
#TODO: make this a parameter
divisor = 15
interval = 96
h = hour(dt)
m = minute(dt)
interval = ( (h * 60) / divisor ) + ( m / divisor )
if (interval == 0) {
interval = 96
}
return(interval)
}
# TODO: Would be nice to have something to return HHMM and make factors
# This will be easy to read for users and if we factor them or index it
# will plot correctly. Might run faster too.
fxn_hour_minute <- function(dt) {
dt <- format(as.POSIXct(dt,
format="%Y-%m-%d %H:%M"),
format="%H:%M")
return (dt)
}
fxn_histogram_plot <- function(df,LocationNumber) {
p1 <- ggplot(df,aes(x = Readvalue)) +
geom_histogram(binwidth=0.05, color = 'black', fill = '#333333') +
ggtitle(paste("Histogram of kWh for Location ",LocationNumber, " (bin=0.05)"))
# p2 <- ggplot(subset(data,data$LocationNumber == LocationNumber),aes(x = Readvalue)) +
# geom_histogram(binwidth=0.05, color = 'black', fill = '#333333') +
# scale_x_sqrt() +
# xlab("sqrt(ReadValue)") +
# ggtitle(paste("Histogram of kWh for Location (bin=0.05)",LocationNumber, ""))
grid.arrange(p1, ncol=1)
}
fxn_daily_scatter_plot <- function(df,LocationNumber) {
ggplot(df, aes(x=hhmm, y=Readvalue, group=h.ReadDate)) +
geom_line(alpha = 1/2) +
xlab("Time") +
ylab("kWh") +
ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
}
fxn_dow_scatter_plot <- function(df,LocationNumber) {
plt <- ggplot(df, aes(x=hhmm, y=Readvalue, group=h.ReadDate)) +
geom_line(alpha = 1/2) +
facet_wrap(~weekday, ncol=7) +
xlab("Time") +
ylab("kWh") +
ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
return(plt)
}
fxn_day_scatter_plot <- function(df,LocationNumber) {
plt <- ggplot(df, aes(x=hhmm, y=Readvalue, group=h.ReadDate)) +
geom_line(alpha = 1/2) +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("kWh") +
ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
return(plt)
}
fxn_scaled_plot <- function(locationNumber) {
p1 <- ggplot(subset(data_summary,data_summary$LocationNumber == locationNumber), aes(x=hhmm, y=mean)) +
geom_point() +
xlab("Time") +
ylab("mean") +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
#scale_x_date(breaks = data_summary$hhmm[seq(1, length(data_summary$hhmm), by = 4)]) +
ggtitle(paste("Load Shapes | January 2018 | ",locationNumber, ""))
p2 <- ggplot(subset(data_summary,data_summary$LocationNumber == locationNumber), aes(x=hhmm, y=scaled_mean)) +
geom_point() +
xlab("Time") +
ylab("scaled_mean") +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
ggtitle(paste("Load Shapes | January 2018 | ",locationNumber, ""))
grid.arrange(p1, p2, ncol=2)
}
# TODO FIX NAME
fxn_scaled_plot_facet_wrap <- function(locationNumber) {
p1 <- ggplot(subset(data_summary,data_summary$LocationNumber %in% locationNumber), aes(x=hhmm, y=mean,group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
xlab("Time") +
ylab("mean") +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
ggtitle(paste("Load Shapes | January 2018"))
p2 <- ggplot(subset(data_summary,data_summary$LocationNumber %in% locationNumber), aes(x=hhmm, y=scaled_mean,group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
xlab("Time") +
ylab("scaled_mean") +
theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
ggtitle(paste("Load Shapes | January 2018"))
grid.arrange(p1, p2, ncol=2)
}
# test$i.ReadDate
#
# for (i in 1:1){
# interval <- fxnInterval_from_datetime(dt)
# hhmm <- fxn_hour_minute(dt)
# print(sprintf("%s is time: %s %i", dt, hhmm, interval))
# #print(sprintf("%s",hhmm))
# dt <- dt + minutes(15)
# }
Customer Wants us to find similar accounts from the same rate codes as the profiles provided. - January - 2018 - kWh
str(data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 40135207 obs. of 11 variables:
## $ h.ReadDate : POSIXct, format: "2018-01-01" "2018-01-01" ...
## $ i.ReadDate : POSIXct, format: "2018-01-01 00:15:00" "2018-01-01 00:30:00" ...
## $ Readvalue : num 2.34 2.27 2.27 2.5 2.29 ...
## $ Uom : chr "KWH" "KWH" "KWH" "KWH" ...
## $ ReadQualityCode: int 10 3 3 3 3 3 3 3 3 3 ...
## $ RecordInterval : int 900 900 900 900 900 900 900 900 900 900 ...
## $ VeeFlag : chr "False" "False" "False" "False" ...
## $ ChannelStatus : chr NA NA NA NA ...
## $ SubstationName : chr "WARTRACE" "WARTRACE" "WARTRACE" "WARTRACE" ...
## $ MeterIdentifier: int 300029 300029 300029 300029 300029 300029 300029 300029 300029 300029 ...
## $ LocationNumber : int 3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 11
## .. ..$ h.ReadDate :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
## .. ..$ i.ReadDate :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
## .. ..$ Readvalue : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Uom : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ ReadQualityCode: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ RecordInterval : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ VeeFlag : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ ChannelStatus : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ SubstationName : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ MeterIdentifier: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ LocationNumber : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
## # A tibble: 6 x 11
## h.ReadDate i.ReadDate Readvalue Uom ReadQualityCode
## <dttm> <dttm> <dbl> <chr> <int>
## 1 2018-01-01 00:00:00 2018-01-01 00:15:00 2.34 KWH 10
## 2 2018-01-01 00:00:00 2018-01-01 00:30:00 2.27 KWH 3
## 3 2018-01-01 00:00:00 2018-01-01 00:45:00 2.27 KWH 3
## 4 2018-01-01 00:00:00 2018-01-01 01:00:00 2.50 KWH 3
## 5 2018-01-01 00:00:00 2018-01-01 01:15:00 2.29 KWH 3
## 6 2018-01-01 00:00:00 2018-01-01 01:30:00 2.31 KWH 3
## # ... with 6 more variables: RecordInterval <int>, VeeFlag <chr>,
## # ChannelStatus <chr>, SubstationName <chr>, MeterIdentifier <int>,
## # LocationNumber <int>
## h.ReadDate i.ReadDate
## Min. :2018-01-01 00:00:00 Min. :2018-01-01 00:00:00
## 1st Qu.:2018-01-08 00:00:00 1st Qu.:2018-01-08 19:30:00
## Median :2018-01-16 00:00:00 Median :2018-01-16 14:00:00
## Mean :2018-01-16 01:20:56 Mean :2018-01-16 13:20:32
## 3rd Qu.:2018-01-24 00:00:00 3rd Qu.:2018-01-24 07:30:00
## Max. :2018-01-31 00:00:00 Max. :2018-02-01 00:00:00
## Readvalue Uom ReadQualityCode RecordInterval
## Min. : -0.0375 Length:40135207 Min. :-1.000 Min. :300.0
## 1st Qu.: 0.0950 Class :character 1st Qu.: 3.000 1st Qu.:900.0
## Median : 0.3845 Mode :character Median : 3.000 Median :900.0
## Mean : 0.7188 Mean : 2.973 Mean :899.9
## 3rd Qu.: 0.9858 3rd Qu.: 3.000 3rd Qu.:900.0
## Max. :263.9831 Max. :10.000 Max. :900.0
## VeeFlag ChannelStatus SubstationName MeterIdentifier
## Length:40135207 Length:40135207 Length:40135207 Min. : 36920
## Class :character Class :character Class :character 1st Qu.:301064
## Mode :character Mode :character Mode :character Median :401426
## Mean :392836
## 3rd Qu.:405627
## Max. :905071
## LocationNumber
## Min. : 10260
## 1st Qu.: 480140
## Median :2110910
## Mean :2809667
## 3rd Qu.:4243280
## Max. :9290400
#data$dtReadDate <- parse_date_time(data$i.ReadDate, orders="ymd HMS")
#data$dtReadDay <- parse_date_time(data$h.ReadDate, orders="ymd HMS")
data$month <- month(data$h.ReadDate)
data$week <- week(data$h.ReadDate)
data$weekday <- wday(data$h.ReadDate, label = TRUE)
data$h <- hour(data$i.ReadDate) # this needs to be rolled back by 15min to be correct
data$hhmm <- fxn_hour_minute(data$i.ReadDate)
# What about a grouping flag for weekday vs weekend
weekday_levels <- c('Mon', 'Tue', 'Wed', 'Thu', 'Fri')
data$isWeekDay <- factor((weekdays(data$h.ReadDate, abbreviate = TRUE )
%in% weekday_levels),
levels=c(FALSE, TRUE), labels=c('Weekend', 'Weekday'))
#day_of_week_levels <- c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
#min(test$weekday)
#[1] Mon
#Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
hhmm_levels <- c("00:15","00:30","00:45","01:00",
"01:15","01:30","01:45","02:00",
"02:15","02:30","02:45","03:00",
"03:15","03:30","03:45","04:00",
"04:15","04:30","04:45","05:00",
"05:15","05:30","05:45","06:00",
"06:15","06:30","06:45","07:00",
"07:15","07:30","07:45","08:00",
"08:15","08:30","08:45","09:00",
"09:15","09:30","09:45","10:00",
"10:15","10:30","10:45","11:00",
"11:15","11:30","11:45","12:00",
"12:15","12:30","12:45","13:00",
"13:15","13:30","13:45","14:00",
"14:15","14:30","14:45","15:00",
"15:15","15:30","15:45","16:00",
"16:15","16:30","16:45","17:00",
"17:15","17:30","17:45","18:00",
"18:15","18:30","18:45","19:00",
"19:15","19:30","19:45","20:00",
"20:15","20:30","20:45","21:00",
"21:15","21:30","21:45","22:00",
"22:15","22:30","22:45","23:00",
"23:15","23:30","23:45","00:00")
data$hhmm <- factor(data$hhmm, levels = hhmm_levels,ordered = TRUE)
# We can add more group by's if needed
# group_by(LocationNumber, h.ReadDate) %>%
data <-
data %>%
group_by(LocationNumber) %>%
mutate(max.per.group = max(as.numeric(Readvalue)))
Allows us to compare locations with out scaling issues
# Fingers cross this gives us like values between 0 and 1 for all meters
data$scale_by_monthmax <- data$Readvalue / data$max.per.group
review <- subset(data, data$h.ReadDate < as.POSIXct("2018-01-01") & data$LocationNumber == 4191826)
#data$interval <- fxnInterval_from_datetime(data$dtReadDate)
# Should dtReadDay, h, are anyother field be usable here? weekday at least since Sat and Sun are not diff load shapes
data_summary <- data %>%
group_by(hhmm, LocationNumber, MeterIdentifier, Uom) %>%
summarise(mean = mean(Readvalue),
median = median(as.numeric(Readvalue)),
min = min(Readvalue),
max = max(Readvalue),
total = sum(Readvalue),
std = sd(Readvalue),
scaled_mean = mean(scale_by_monthmax),
n = n()) %>%
arrange(hhmm, LocationNumber, MeterIdentifier, Uom)
To be used for removing dirty data.
# Should dtReadDay, h, are anyother field be usable here? weekday at least since Sat and Sun are not diff load shapes
data_total <- data_summary %>%
group_by(LocationNumber, MeterIdentifier) %>%
summarise(intervals = sum(n),
expected = as.numeric(2976)) %>%
arrange(LocationNumber, MeterIdentifier)
# get the unique locations
x <- unique(data$LocationNumber)
locationNumber <- data.frame(x)
summary(locationNumber)
## x
## Min. : 10260
## 1st Qu.: 480116
## Median :2091322
## Mean :2796033
## 3rd Qu.:4242918
## Max. :9290400
sample <- locationNumber[sample(nrow(locationNumber), 5), ]
sample <- c(sample, '4191826')
length(sample)
## [1] 6
length(unique(sample))
## [1] 6
# Get the data just for the unique locations
sampleData <- subset(data, data$LocationNumber %in% sample)
count(subset(data_total, data_total < 2976))
## Warning: Length of logical index must be 1 or 13728, not 54912
## # A tibble: 1 x 2
## # Groups: LocationNumber [1]
## LocationNumber n
## <int> <int>
## 1 NA 2399
data_summary$scaled_value_max <- data_summary$mean/max(data_summary$mean)
data_summary$scaled_value_sd <- data_summary$mean/data_summary$std
# x <- subset(data_summary,data_summary$LocationNumber == 11280)
#
# write.table(x, file = "sample2.csv",
# sep = ",",
# col.names = NA,
# qmethod = "double")
Data is left shifted, and we have a long right tail. Data is skewed by a few meters.
Data has locations that is not represntative of the entire dataset.
Data is not normally distributed.
#h1 <-
# ggplot(data,aes(x = data$Readvalue)) +
# geom_histogram(binwidth=0.05, color = 'black', fill = '#333333') +
# scale_x_continuous(limits=c(0,50)) +
# ggtitle("Histogram of kWh (bin=0.05)")
#h2 <- h1 + scale_x_log10()
#h3 <- h1 + scale_x_sqrt()
#h4 <- h1 + coord_trans('sqrt')
We saw above the locations for 4191826 and 1751219 have similar mean load shapes. Let’s see if they have similar day to day load shapes for the read values.
locations <- c(4191826,1751219 )
df <- subset(data, data$LocationNumber %in% locations)
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("scaled kWh (ReadValue/max(ReadValue")
ggplot(df,aes(x=hhmm, y=Readvalue, group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("kWh")
rm(df)
locations <- c(4191826,4066442)
df <- subset(data, data$LocationNumber %in% locations)
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("scaled kWh (ReadValue/max(ReadValue") +
ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))
locations <- c(70220,535450)
df <- subset(data, data$LocationNumber %in% locations)
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("scaled kWh (ReadValue/max(ReadValue") +
ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))
fxn_scaled_plot_facet_wrap(locations)
locations <- c(1751219,4706060)
df <- subset(data, data$LocationNumber %in% locations)
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = factor(LocationNumber))) +
geom_line() +
facet_wrap(h.ReadDate ~ weekday, ncol=7) +
xlab("Time") +
ylab("scaled kWh (ReadValue/max(ReadValue") +
ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))
fxn_scaled_plot_facet_wrap(locations)
# ggplot(data_summary, aes(x=hhmm, y=mean)) +
# geom_point(size=2, shape=23) +
# theme(axis.text.x = element_text(angle = -90, hjust = 0))
# ggplot(model_data_one_mean, aes(x=hhmm, y=log(mean))) +
# geom_point(size=2, shape=23) +
# theme(axis.text.x = element_text(angle = -90, hjust = 0))
Normailizing the data so all load profiles are simular.
We do this by taking the max value and dividing all values by the max. This allows us to compare all locations (timeseries objects) without focusing on the total consumption.
TBD
What we need is something like this:
LocationNumber : {scaled_value1, …. scaled_value2} - Possible additional factors, Sub, RateCode, Weekday, hhmm, GIS Location???
White-paper
Predicting Consumer Load Profiles Using Commercial and Open Data Dauwe Vercamer, Bram Steurtewagen, Dirk Van den Poel, Senior Member, IEEE, and Frank Vermeulen
This paper addresses the issue of assigning new customers, for whom no AMI readings are available, to one of these load profiles. This post-clustering phase has received little attention in the past
Step: Identify and remove outliers from eaplot_by_interval_by_day_4191826ch location. Identify and check on 0 values, and remove. Identify and correct or remove duplicate intervals… identify and remove locations with not enough data.
Example data found - 33 days in the month (Whats the cause, how to clean) - 1 day avaiable in the month (whats the cause, how to clean or remove)
Quick Goal: Focus on perfect data for the first run. Clean up will be a larger effort after the model is created. So… 31 days of data, for each hour of every single day. Should be 31*96 = 2976 intervals per meter/location.
Long Term Goal: - Remove outliers (I’m seeing one on the first meter already)
Step 1: Identify the load profiles. check for outliers in the first metering point location. See if the chosen load profile is accurate enough to generate a model.
LP can be Daily Weekly Monthly
Question: how to compare the load shapes??????? Maybe - an average daily 15 minute pattern (96 measurements per customer). Average over what time frame, week, or month? Season? Location -> pivoted data… or columar?
Location, ReadValue1, …. ReadValue96
or
Question: Also, how many unique / relative profiles exist in the Duck River kWh data? Question: How to handled weekdays and weekends? Include or Exclude from the means? Question: Do we want a profile for day or profile for weekdays and weekends?
Question: Can we use person’s to determine if locations are correlted? Is this even correct to do here? See Lesson4_student.html examples
Rate Codes: Do we have more unique profiles which require more rate codes, or do the rate codes we have, for each location exhibit the same load profile shapes for their current assigned rate code?
Attributes? What are the attributes that make up a load profile for Duck River’s data? Rate Code? Meter Type? Weather?
Steps - Identify Outliers - Remove Outliers - Aggregate to daily patterns / hourly partners - Filter time series to remove missing values - Normalize Time Series - remove estimates… etc…
Additional Steps Select a time series to be clustered perform spectral clustering perform k-means calculate davis-bouldin index calculate dunn index best rank of dun and d-b
Rate 22 All electric, 400A service, residential account Location 4191826, meter is 300071 and Account # 303606-002 Location 70220, meter is 303099 and account is 324749-001
All electric 200A service Location 1751219 Location 1522548 (has a scope and other out buildings) Account #:
All electric 600A service and geothermal HVAC Location 226760 -
Electric and gas or wood head 200A service Location 1630115 Location 1530057
Find all the accounts that have similar load shape.
kernlab package for clustering ada and Random Forest for classification https://rpubs.com/FelipeRego/K-Means-Clustering https://www.r-bloggers.com/clustering-mixed-data-types-in-r/ https://shiny.rstudio.com/gallery/kmeans-example.html https://towardsdatascience.com/how-to-cluster-your-customer-data-with-r-code-examples-6c7e4aa6c5b1 https://uc-r.github.io/kmeans_clustering https://robjhyndman.com/TSDL/
http://readr.tidyverse.org/reference/read_delim.html
https://escience.rpi.edu/data/DA/svmbasic_notes.pdf
https://cran.r-project.org/web/packages/kernlab/kernlab.pdf
https://stats.stackexchange.com/questions/142400/quantifying-similarity-between-two-data-sets
https://www.rdocumentation.org/packages/SimilarityMeasures/versions/1.4/topics/LCSS